package au.com.acpfg.xml.query;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.knime.core.node.defaultnodesettings.SettingsModelStringArray;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import com.fatdog.xmlEngine.DocItems;
import com.fatdog.xmlEngine.IntList;
import com.fatdog.xmlEngine.NodeTree;
import com.fatdog.xmlEngine.QueryDocumentTree;
import com.fatdog.xmlEngine.ResultList;
import com.fatdog.xmlEngine.XQEngine;
import com.fatdog.xmlEngine.exceptions.CantParseDocumentException;
import com.fatdog.xmlEngine.exceptions.MissingOrInvalidSaxParserException;
import au.com.acpfg.xml.query.XMLQueryEntry.ResultsType;
import au.com.acpfg.xml.query.XQueryReporter.QueryResponseFragmentType;
import au.com.acpfg.xml.reader.XMLCell;
/**
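* This is the model implementation of the XQuery processor node.
* Provides an XPath/XQuery KNIME API over the XML "blob" cell type for data processing: the user's queries are run against a column of XML cells and the results are reported as table columns. Useful for many life science XML formats (PepXML, ProtXML, etc.)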
*
* @author Andrew Cassin
*/
public class XQueryProcessorNodeModel extends NodeModel {
// the logger instance
private static final NodeLogger logger = NodeLogger
.getLogger(XQueryProcessorNodeModel.class);
/** the settings key which is used to retrieve and
store the settings (from the dialog or from a settings file)
(package visibility to be usable from the dialog). */
static final String CFGKEY_QUERIES = "dialog-xqueries"; // NB: must match Configure-Dialog class code!
static final String CFGKEY_XML_CELL_OUT = "xml-cell-output";
static final String CFGKEY_XML_COL = "xml-column";
// node settings: the name of the input column holding the XML data and the
// user's queries (each string is parsed by XMLQueryEntry during execution).
// The default components of the dialog work with "SettingsModels".
private final SettingsModelString m_xml_col = new SettingsModelString(CFGKEY_XML_COL, "XML Data");
private final SettingsModelStringArray m_queries = new SettingsModelStringArray(CFGKEY_QUERIES, new String[] {});
//private final SettingsModelBoolean m_xml_out = new SettingsModelBoolean(CFGKEY_XML_CELL_OUT, false);
/* private state which does not require persistence */
private HashMap<Integer,DataColumnSpec> m_extra_cols;
private HashMap<ResultsType,String> m_rt_set; // maps between the type of column and the column name
/**
* Constructor for the node model. Passes (1, 1) to the NodeModel constructor;
* execute() correspondingly reads a single input table and returns a single
* output table.
*/
protected XQueryProcessorNodeModel() {
super(1, 1);
}
/**
 * Runs every enabled user query against each XML cell of the configured input
 * column and adds one output row (row id {@code Hit<n>}) for each query that
 * yields a result document.
 *
 * <p>Rows whose XML cell is missing are skipped; rows whose cell is not an
 * {@link XMLCell} are skipped with a warning. Queries with an empty query string
 * are ignored. A query that asks only for the result count produces a count-only
 * row; otherwise the results are walked with {@link #getXML} and the bound
 * reporters supply the cell values.
 *
 * <p>Exceptions are propagated to the caller unchanged (no stack trace is printed
 * to stderr here).
 *
 * @param inData the input tables; only {@code inData[0]} is read
 * @param exec   execution context used to create the output container and to
 *               honour cancellation requests
 * @return a one-element array holding the result table
 * @throws InvalidSettingsException  if the XML column cannot be found or no query is enabled
 * @throws FailedPathException       if a query flagged "fail on empty" has no valid hits
 * @throws CanceledExecutionException if the user cancels the execution
 * @throws Exception                 any other failure raised by the query engine or helpers
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
        final ExecutionContext exec) throws Exception {
    // only queries the user has enabled take part in the run
    List<XMLQueryEntry> enabled_queries = new ArrayList<XMLQueryEntry>();
    for (String xqes : m_queries.getStringArrayValue()) {
        XMLQueryEntry xqe = new XMLQueryEntry(xqes);
        if (xqe.isEnabled()) {
            enabled_queries.add(xqe);
        }
    }

    // make output columns (this also (re)builds m_rt_set and m_extra_cols)
    DataColumnSpec[] output_cols = make_output_cols(enabled_queries);
    DataTableSpec outputSpec = new DataTableSpec(output_cols);

    // validate the settings BEFORE allocating the output container, so that a
    // configuration error does not leave an open container behind
    int xml_col_idx = inData[0].getDataTableSpec().findColumnIndex(m_xml_col.getStringValue());
    if (xml_col_idx < 0) {
        throw new InvalidSettingsException("Cannot locate column with XML data: "+m_xml_col.getStringValue());
    }
    if (enabled_queries.isEmpty()) {
        throw new InvalidSettingsException("Nothing to search - no enabled user queries specified... please re-configure!");
    }

    // ensure reporters are bound to each column for results desired by user
    // NOTE(review): make_output_cols() also creates a column for TEXT_COLLECTION, but no
    // reporter is bound for it here -- confirm whether that is intended.
    XQueryReporter r = new XQueryReporter();
    if (m_rt_set.containsKey(ResultsType.RAW_XML))
        new StringReporter(r, m_rt_set.get(ResultsType.RAW_XML));
    if (m_rt_set.containsKey(ResultsType.ELEMENT_DISTRIBUTION))
        new ElementDistributionReporter(r, m_rt_set.get(ResultsType.ELEMENT_DISTRIBUTION));
    if (m_rt_set.containsKey(ResultsType.RAW_XML_COLLECTION))
        new ElementCollectionReporter(r, m_rt_set.get(ResultsType.RAW_XML_COLLECTION));
    if (m_rt_set.containsKey(ResultsType.XMLATTR_COLLECTION))
        new AttributeCollectionReporter(r, m_rt_set.get(ResultsType.XMLATTR_COLLECTION));
    if (m_rt_set.containsKey(ResultsType.TEXT))
        new TextReporter(r, m_rt_set.get(ResultsType.TEXT));

    // make output container
    BufferedDataContainer container = exec.createDataContainer(outputSpec, true, 100);

    int hit = 0;
    RowIterator it = inData[0].iterator();
    while (it.hasNext()) {
        // let the user abort a long run between input rows
        exec.checkCanceled();

        DataRow row = it.next();
        DataCell xml_cell = row.getCell(xml_col_idx);
        if (xml_cell == null || xml_cell.isMissing())
            continue;
        // this should not happen in practice -- programmer error!
        if (!(xml_cell instanceof XMLCell)) {
            logger.warn("Skipping row "+row.getKey().getString()+": cell in column "
                    +m_xml_col.getStringValue()+" is not an XML cell");
            continue;
        }

        // need to create a new engine each time, so java GC can release the AST from the previous file...
        XMLCell xc = (XMLCell) xml_cell;
        XQEngine engine = new XQEngine();
        engine.setXMLReader(xc.getReader(true)); // always strip namespaces for now..
        engine.setDocument(xc.asFile().getAbsolutePath());

        String rkey = row.getKey().getString();
        for (XMLQueryEntry xqe : enabled_queries) {
            String path = xqe.getQuery();
            if (path.length() == 0) {
                continue;
            }
            ResultList results = engine.setQuery(path);
            DocItems di = results.nextDocument();
            int num_valid_items = results.getNumValidItems();
            logger.info("Got "+num_valid_items+" valid hits (total " + results.getNumTotalItems()+") for "+rkey+ ", query="+path);

            // honour "fail on empty" before the no-document skip below, otherwise a query
            // that produced no result document at all would never trigger the failure
            if (xqe.getFailEmpty() && num_valid_items == 0) {
                throw new FailedPathException(rkey, "No matches for "+xqe.getName());
            }
            if (di == null) {
                continue;
            }

            Set<ResultsType> wanted_set = xqe.getWantedResultsSet();
            // user only want one row for the query?
            if (wanted_set.size() == 1 && wanted_set.contains(ResultsType.RESULTS_COUNT)) {
                DataCell[] cells = getResultCount(rkey, path, num_valid_items, output_cols.length);
                container.addRowToTable(new DefaultRow("Hit"+hit++, cells));
            } else {
                // report each XQuery result, for every hit in the current file
                getXML(di, results, r);
                DataCell[] r2 = getRowCells(r, rkey, path, num_valid_items, output_cols.length);
                container.addRowToTable(new DefaultRow("Hit"+hit++, r2));
                r.reset(); // reset traversal between hits
            }
            logger.info("Completed processing query: "+path);
        }
    }

    // once we are done, we close the container and return its table
    container.close();
    BufferedDataTable out = container.getTable();
    return new BufferedDataTable[]{out};
}
/**
 * Builds a count-only output row: filename and query in the first two cells,
 * the number of valid hits in the column named "Result Count" (if present),
 * and a missing cell everywhere else.
 *
 * @param filename    value for the first cell
 * @param path        the query text, value for the second cell
 * @param valid_items number of valid hits to report
 * @param ncols       total number of cells in the row
 * @return the populated cells
 */
protected DataCell[] getResultCount(String filename, String path, int valid_items, int ncols) {
    final DataCell[] row = new DataCell[ncols];
    int pos = 0;
    while (pos < ncols) {
        row[pos] = DataType.getMissingCell();
        pos++;
    }
    row[0] = new StringCell(filename);
    row[1] = new StringCell(path);

    // any other extra column is left as a missing value
    for (java.util.Map.Entry<Integer, DataColumnSpec> extra : m_extra_cols.entrySet()) {
        final String name = extra.getValue().getName();
        if (name.equals("Result Count")) {
            row[extra.getKey().intValue()] = new IntCell(valid_items);
        }
    }
    return row;
}
/**
 * Builds a full output row for one query: filename and query in the first two
 * cells, then one cell per extra column.
 *
 * <p>The "Result Count" column (same name as used by {@link #getResultCount}) is
 * filled from {@code valid_items}; previously that parameter was ignored, so the
 * count was never reported when combined with other result types. All other extra
 * columns are taken from the reporter. A {@code null} cell from the reporter is
 * replaced by a missing cell so the row never contains {@code null}.
 *
 * @param r           reporter holding the results collected for the current query
 * @param filename    value for the first cell
 * @param path        the query text, value for the second cell
 * @param valid_items number of valid hits for the query
 * @param ncols       total number of cells in the row
 * @return the populated cells
 */
protected DataCell[] getRowCells(XQueryReporter r, String filename,
        String path, int valid_items, int ncols) {
    DataCell[] cells = new DataCell[ncols];
    for (int i=0; i<ncols; i++) {
        cells[i] = DataType.getMissingCell();
    }
    cells[0] = new StringCell(filename);
    cells[1] = new StringCell(path);
    for (Integer colidx : m_extra_cols.keySet()) {
        String col_name = m_extra_cols.get(colidx).getName();
        DataCell cell;
        if (col_name.equals("Result Count")) {
            // the count comes from the engine, not from a reporter
            cell = new IntCell(valid_items);
        } else {
            cell = r.getResultCell(col_name);
        }
        if (cell != null) {
            cells[colidx.intValue()] = cell;
        }
        // else: keep the missing cell placed above
    }
    return cells;
}
/**
 * Computes the output table columns for the supplied queries and, as a side
 * effect, rebuilds {@code m_rt_set} (result type to column name) and
 * {@code m_extra_cols} (column index to column spec).
 *
 * <p>The first two columns are always "Filename" and "XQuery"; one further column
 * is appended per distinct result type requested by the queries, in the order the
 * types are first encountered.
 *
 * @param enabled_queries the queries whose wanted result types determine the columns
 * @return the column specs in output order
 * @throws UnsupportedOperationException if ELEMENTS_AS_COLUMNS is requested (not implemented)
 * @throws IllegalArgumentException      if a result type is not recognised
 * @throws Exception                     declared for backward compatibility with existing callers
 */
protected DataColumnSpec[] make_output_cols(List<XMLQueryEntry> enabled_queries) throws Exception {
    // compute m_rt_set and m_extra_cols based on supplied queries
    m_rt_set = new HashMap<ResultsType,String>();
    m_extra_cols = new HashMap<Integer,DataColumnSpec>();
    int cols = 2;
    final DataType string_list = ListCell.getCollectionType(StringCell.TYPE);

    for (XMLQueryEntry xqe : enabled_queries) {
        for (ResultsType rt : xqe.getWantedResults()) {
            if (m_rt_set.containsKey(rt)) {
                continue; // each result type gets exactly one column
            }
            String colname = XMLQueryEntry.colname(rt);
            m_rt_set.put(rt, colname);

            final DataType col_type;
            if (rt == ResultsType.RAW_XML) {
                col_type = XMLCell.TYPE;
            } else if (rt == ResultsType.RESULTS_COUNT) {
                col_type = IntCell.TYPE;
            } else if (rt == ResultsType.TEXT) {
                col_type = StringCell.TYPE;
            } else if (rt == ResultsType.RAW_XML_COLLECTION
                    || rt == ResultsType.TEXT_COLLECTION
                    || rt == ResultsType.XMLATTR_COLLECTION
                    || rt == ResultsType.ELEMENT_DISTRIBUTION) {
                // all collection-style results are reported as a list of strings
                col_type = string_list;
            } else if (rt == ResultsType.ELEMENTS_AS_COLUMNS) {
                // throw... need to compute this
                throw new UnsupportedOperationException("TODO... not implemented!");
            } else {
                throw new IllegalArgumentException("Unsupported result type: "+rt);
            }
            // Integer.valueOf replaces the deprecated Integer constructor
            m_extra_cols.put(Integer.valueOf(cols++),
                    new DataColumnSpecCreator(colname, col_type).createSpec());
        }
    }

    DataColumnSpec[] allColSpecs = new DataColumnSpec[cols];
    allColSpecs[0] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec();
    allColSpecs[1] = new DataColumnSpecCreator("XQuery", StringCell.TYPE).createSpec();
    for (Integer col_idx : m_extra_cols.keySet()) {
        allColSpecs[col_idx.intValue()] = m_extra_cols.get(col_idx);
    }
    return allColSpecs;
}
/**
 * {@inheritDoc}
 *
 * <p>Drops the column lookup tables built by the last execution; they are
 * recreated by {@code make_output_cols} at the start of the next execute.
 */
@Override
protected void reset() {
    m_extra_cols = null;
    m_rt_set = null;
}
/**
 * {@inheritDoc}
 *
 * <p>Checks that the configured XML column exists in the incoming table so the
 * problem is reported at configure time rather than only when the node executes.
 * The output spec depends on the user's queries and is not computed here, so a
 * {@code null} spec is returned.
 *
 * @param inSpecs the specs of the incoming tables; only the first is inspected
 * @return a one-element array containing {@code null}
 * @throws InvalidSettingsException if no XML column is set or it is not present in the input
 */
@Override
protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
        throws InvalidSettingsException {
    final String xml_col = m_xml_col.getStringValue();
    if (xml_col == null) {
        throw new InvalidSettingsException("No column with XML data has been selected");
    }
    // be tolerant of a missing input spec: only validate when one is available
    if (inSpecs != null && inSpecs.length > 0 && inSpecs[0] != null
            && inSpecs[0].findColumnIndex(xml_col) < 0) {
        throw new InvalidSettingsException("Cannot locate column with XML data: "+xml_col);
    }
    return new DataTableSpec[]{null};
}
/**
 * {@inheritDoc}
 *
 * <p>Writes the XML column name via its settings model and the query strings
 * directly under {@code CFGKEY_QUERIES}.
 */
@Override
protected void saveSettingsTo(final NodeSettingsWO settings) {
    m_xml_col.saveSettingsTo(settings);
    final String[] serialised_queries = m_queries.getStringArrayValue();
    settings.addStringArray(CFGKEY_QUERIES, serialised_queries);
}
/**
 * {@inheritDoc}
 *
 * <p>Reads the query strings stored under {@code CFGKEY_QUERIES} first, then the
 * XML column name.
 */
@Override
protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
        throws InvalidSettingsException {
    m_queries.setStringArrayValue(settings.getStringArray(CFGKEY_QUERIES));
    m_xml_col.loadSettingsFrom(settings);
}
/**
 * {@inheritDoc}
 *
 * <p>Validates the XML column setting and then tries every stored query against a
 * throw-away engine loaded with a trivial document, rejecting the settings if the
 * engine refuses a query.
 *
 * @param settings the settings to validate
 * @throws InvalidSettingsException if the column setting is invalid, no SAX parser
 *         can be created, or a query is rejected by the engine; the underlying
 *         exception is attached as the cause
 */
@Override
protected void validateSettings(final NodeSettingsRO settings)
        throws InvalidSettingsException {
    m_xml_col.validateSettings(settings);
    final String[] queries = settings.getStringArray(CFGKEY_QUERIES);

    final XQEngine engine = new XQEngine();
    try {
        final XMLReader rdr = XMLReaderFactory.createXMLReader();
        engine.setXMLReader(rdr);
        engine.setExplicitDocument("<?xml version=\"1.0\" encoding=\"utf-8\"?><test></test>");
    } catch (CantParseDocumentException ex) {
        // best effort, as before: carry on with query validation, but report the
        // problem through the node logger instead of stderr
        logger.warn("Unable to load the test document used for query validation", ex);
    } catch (MissingOrInvalidSaxParserException ex) {
        throw new InvalidSettingsException("No SAX compliant XML parser available! Check your KNIME/Java installation!", ex);
    } catch (SAXException ex) {
        throw new InvalidSettingsException(ex.getMessage(), ex);
    }

    if (queries == null) {
        return; // nothing stored, nothing to validate
    }
    for (String serialised : queries) {
        final XMLQueryEntry xqe = new XMLQueryEntry(serialised);
        try {
            engine.setQuery(xqe.getQuery());
        } catch (Exception exc) {
            // separators added: the original ran name, reason and query text together
            throw new InvalidSettingsException("Invalid query: "+xqe.getName()+", please fix the query. "
                    +exc.getMessage()+" (query: "+xqe.getQuery()+")", exc);
        }
    }
}
/**
* {@inheritDoc}
* <p>Intentionally empty: this implementation reads nothing from {@code internDir}.
*/
@Override
protected void loadInternals(final File internDir,
final ExecutionMonitor exec) throws IOException,
CanceledExecutionException {
// no-op
}
/**
* {@inheritDoc}
* <p>Intentionally empty: this implementation writes nothing to {@code internDir}.
*/
@Override
protected void saveInternals(final File internDir,
final ExecutionMonitor exec) throws IOException,
CanceledExecutionException {
// no-op
}
/**
 * Feeds every non-voided item of one result document to the reporter.
 *
 * <p>Items whose type is {@code >= DocItems.ELEM} and document nodes are walked as
 * trees via {@link #getTreeXML}; atomic items (int, string, boolean, attribute
 * text, element text) are passed to the reporter directly. Nothing is reported
 * when the document has no valid items.
 *
 * @param di the result document to report
 * @param rl the result list the document belongs to (used to resolve string results)
 * @param sb the reporter receiving the fragments
 * @throws IllegalArgumentException if an item has a type that is not handled here
 */
public void getXML(DocItems di, ResultList rl, XQueryReporter sb ) {
    assert(di != null && rl != null && sb != null);
    if (di.getNumValidItems() == 0) {
        return;
    }

    final IntList items = di.getIntList();
    final NodeTree doc_tree = di.getTree();
    for (int idx = 0; idx < items.count(); idx++) {
        final int kind = items.getRef_2(idx);
        if (kind == DocItems.VOIDED_NODE) {
            continue;
        }
        final int payload = items.getRef_1(idx);
        if (kind >= DocItems.ELEM) {
            getTreeXML(doc_tree, payload, sb);
            continue;
        }

        switch (kind) {
            case DocItems.DOC_NODE:
                getTreeXML(doc_tree, 0, sb);
                break;
            case DocItems.INT:
                sb.call(QueryResponseFragmentType.RESP_INT, payload);
                break;
            case DocItems.STRING:
                // BUG: needs to use di.m_indexer but thats not accessible, so use rl.getIndexer() instead...
                sb.call(rl.getIndexer().getCurrTreeWalker().getStringResult(payload));
                break;
            case DocItems.BOOLEAN:
                sb.call(payload != 0 ? "true" : "false");
                break;
            case DocItems.ATTR_TEXT:
                sb.call(doc_tree.getAttributeText(payload));
                break;
            case DocItems.TEXT_AS_STRING: // was TEXT, but string() function changed it to this
                sb.call(doc_tree.getElementText(payload));
                break;
            // item types -8 (double) and -9 (decimal) are not handled here and end up in
            // the default branch; they would also need the indexer's tree walker
            default:
                throw new IllegalArgumentException("\nDocumentItems:emitXml(): unknown item type " + kind );
        }
    }
}
/**
 * Reports the node at index {@code node} of the tree, dispatching on its type:
 * document node (its first child element is emitted), element / element
 * constructor, attribute, or text.
 *
 * @param self the tree containing the node
 * @param node index of the node to report
 * @param sb   the reporter receiving the fragments
 * @throws IllegalArgumentException if the node type is none of the above
 */
protected void getTreeXML(NodeTree self, int node, XQueryReporter sb) {
    assert(self != null && sb != null);
    final int kind = self.getType(node);
    if (kind == DocItems.DOC_NODE) {
        emitElementNode(self, node + 1, sb, false);
    } else if (kind == NodeTree.ELEM || kind == QueryDocumentTree.ELEMENT_CTOR) {
        final boolean hoist_attrs = hoistAttributeNodesInEnclosedContent(self, node + 1);
        emitElementNode(self, node, sb, hoist_attrs);
    } else if (kind == NodeTree.ATTR) {
        emitAttributeNode(self, node, sb);
    } else if (kind == NodeTree.TEXT) {
        emitTextNode(self, node, sb);
    } else {
        throw new IllegalArgumentException( "getTreeXML() unknown toplevel nodetype: " + self.getType( node ));
    }
}
/**
 * Tells whether the node at {@code attrNode} is an ENCLOSED_RESULTS node whose
 * enclosed results consist of attributes only.
 *
 * @param self     the tree to inspect
 * @param attrNode index of the candidate node; indices at or beyond the node count yield {@code false}
 * @return {@code true} only if all three conditions hold
 */
protected boolean hoistAttributeNodesInEnclosedContent(NodeTree self, int attrNode) {
    // short-circuit evaluation keeps the cast from running unless the type check passed
    return attrNode < self.getNodeCount()
            && self.getType(attrNode) == QueryDocumentTree.ENCLOSED_RESULTS
            && ((QueryDocumentTree) self).getEnclosedResults(attrNode).isAttributesOnly();
}
/**
 * Reports every attribute node found in the remaining documents of the result
 * list and marks each one as voided, decrementing the valid-item counts of both
 * the list and the owning document.
 *
 * @param rl the result list to scan (its document iterator is advanced to the end)
 * @param sb the reporter receiving the attribute name/value pairs
 */
protected void emitAttributes(ResultList rl, XQueryReporter sb) {
    for (DocItems doc = rl.nextDocument(); doc != null; doc = rl.nextDocument()) {
        final NodeTree tree = doc.getTree();
        final IntList refs = doc.getIntList();
        for (int pos = 0; pos < refs.count(); pos++) {
            final int node = refs.getRef_1(pos);
            if (tree.getType(node) != NodeTree.ATTR) {
                continue;
            }
            // void the item before reporting it
            rl.updateValidItemCount(-1);
            doc.updateValidItemCount(-1);
            refs.setRef_2(pos, DocItems.VOIDED_NODE);
            sb.call(QueryResponseFragmentType.RESP_ATTRIBUTE,
                    tree.getAttributeName(node), tree.getAttributeText(node));
        }
    }
}
/**
* Reports one element and its content to the reporter: start of the element, its
* attribute nodes, optionally attributes hoisted from enclosed results, then each
* node whose parent is this element, and finally the end-element tag.
*
* NOTE(review): the traversal relies on a node's attributes and children being
* stored at the indices directly following it -- confirm against NodeTree.
*
* @param self the tree containing the element
* @param node index of the element node
* @param sb the reporter receiving the fragments
* @param hoistAttributes if true, attributes from the enclosed results found at the
* index just after this element's own attribute nodes are reported as well
* @return the node index at which traversal stopped
* @throws IllegalArgumentException if a node of an unhandled type is encountered
*/
protected int emitElementNode( NodeTree self, int node, XQueryReporter sb, boolean hoistAttributes)
{
// remember our own index: children are recognised by having it as their parent
int myself = node;
String elementName = self.getElementName( node );
int num_nodes = self.getNodeCount();
sb.call(XQueryReporter.QueryResponseFragmentType.RESP_START_ELEMENT, elementName);
// report the run of attribute nodes that directly follows the element
while( ++ node < num_nodes && self.getType( node ) == NodeTree.ATTR )
{
emitAttributeNode( self, node, sb );
}
if (hoistAttributes) {
emitAttributes(((QueryDocumentTree)self).getEnclosedResults(node), sb);
}
// no node left that has this element as parent: report it as an element without content
if ( node >= num_nodes || self.getParent( node ) != myself )
{
sb.call(XQueryReporter.QueryResponseFragmentType.RESP_INCOMPLETE_END_ELEMENT, "");
return node;
}
else
sb.call(XQueryReporter.QueryResponseFragmentType.RESP_END_TAG, ""); // just the closing angle bracket to the start tag
// report each node whose parent is this element; every branch advances 'node'
while( node < num_nodes && self.getParent( node ) == myself )
{
int nodeType = self.getType( node );
switch( nodeType )
{
case NodeTree.ELEM :
boolean hoist = hoistAttributeNodesInEnclosedContent(self, node+1);
// the recursive call returns the index at which it stopped
node = emitElementNode( self, node, sb, hoist );
break;
case NodeTree.TEXT :
node = emitTextNode( self, node, sb );
break;
case QueryDocumentTree. ELEMENT_CTOR : // skip if encountered in element content
// (only emitted indirectly via ENCLOSED_RESULTS below)
int parent = node;
// advance past the nodes whose parent is the constructor node
while( ++ node < num_nodes && self.getParent(node) == parent ) { }
break;
case QueryDocumentTree. RESERVED :
case QueryDocumentTree. ENCLOSED_RESULTS :
// these node types are only handled for a QueryDocumentTree
if ( ! (self instanceof QueryDocumentTree) )
throw new IllegalArgumentException("emitElementNode(): unknown nodetype " + self.getType( node ));
if ( nodeType == QueryDocumentTree. RESERVED )
sb.call( "Reserved" );
else
getNestedResults(((QueryDocumentTree) self).getEnclosedResults( node ), sb);
++ node;
break;
default :
throw new IllegalArgumentException("emitElementNode(): unknown nodetype " + self.getType( node ));
}
}
sb.call(QueryResponseFragmentType.RESP_END_ELEMENT_TAG, elementName);
return node;
}
/**
 * Reports all documents of an enclosed result list, starting from its first
 * document. Intended to mirror ResultList.emitXml() in the XQEngine codebase.
 *
 * @param enclosedResults the nested results to report
 * @param sb              the reporter receiving the fragments
 */
private void getNestedResults(ResultList enclosedResults, XQueryReporter sb) {
    // NOTE(review): the guard only rejects a negative count; confirm whether "== 0" was meant
    if (enclosedResults.getNumValidItems() >= 0) {
        enclosedResults.resetDocumentIterator();
        for (DocItems doc = enclosedResults.nextDocument(); doc != null;
                doc = enclosedResults.nextDocument()) {
            // per-document reporting, the counterpart of DocItems.emitXml()
            getXML(doc, enclosedResults, sb);
        }
    }
}
/**
 * Reports the text of the given node.
 *
 * @param self the tree containing the node
 * @param node index of the text node
 * @param sb   the reporter receiving the text
 * @return {@code node + 1}
 */
private int emitTextNode(NodeTree self, int node, XQueryReporter sb) {
    sb.call(self.getElementText(node));
    return node + 1;
}
/**
 * Reports the name and value of the attribute node at the given index.
 *
 * @param self the tree containing the node
 * @param node index of the attribute node
 * @param sb   the reporter receiving the attribute
 */
private void emitAttributeNode(NodeTree self, int node, XQueryReporter sb) {
    sb.call(QueryResponseFragmentType.RESP_ATTRIBUTE,
            self.getAttributeName(node),
            self.getAttributeText(node));
}
}